import pandas as pd
import numpy as np
# Report the library versions this notebook runs against.
for label, module in (('pandas', pd), ('numpy', np)):
    print(label, module.__version__)

pandas 1.1.5
numpy 1.19.5


# Column-oriented data: each key becomes a column name and each list holds
# that column's values, one entry per row.
data = dict(
    Name=['Wood', 'Winston', 'Williams', 'White', 'Lorenzen'],
    Weight=[255, 231, 195, 185, 285],
    No=[49, 2, 43, 14, 13],
    Pos=['LS', 'QB', 'FS', 'WR', 'QB'],
)


# Build the DataFrame from the dictionary and display it.
df1 = pd.DataFrame(data)
print(df1)

       Name  Weight  No Pos
0      Wood     255  49  LS
1   Winston     231   2  QB
2  Williams     195  43  FS
3     White     185  14  WR
4  Lorenzen     285  13  QB


# Location of the practice CSV, served from GitHub's raw-content host.
download_url = "https://raw.githubusercontent.com/sdhar-pycourse/py-datascience-biz/main/practise-datasets/bivar.csv"
# read_csv accepts a URL directly and parses the response into a DataFrame.
df2 = pd.read_csv(filepath_or_buffer=download_url)


# Report each frame's type and its in-memory footprint in KB.
# DataFrame.size is the element count (rows * columns), not a byte size, so
# the footprint is taken from memory_usage() instead. deep=True also counts
# the Python objects held by object-dtype columns (e.g. strings).
for df in [df1, df2]:
    kilobytes = df.memory_usage(deep=True).sum() / 1024
    print('Object Type:', type(df), round(kilobytes, 2), 'in kb')

Object Type: <class 'pandas.core.frame.DataFrame'> 20 in kb
Object Type: <class 'pandas.core.frame.DataFrame'> 546 in kb


from google.colab import files

# Write df2 to a CSV file, then hand that file to Colab's download helper.
csv_name = 'filename.csv'
df2.to_csv(csv_name)
files.download(csv_name)


# Configure pandas to display all columns (None removes the column limit).
# The exact option name is "display.max_columns"; the previous spelling
# "display.max.columns" only resolved because option lookup falls back to
# regex matching, which can break if similarly named options are added.
pd.set_option("display.max_columns", None)
print('Max Num of Cols:', pd.options.display.max_columns)
# Show floating-point values with 2 decimal places.
pd.set_option('display.precision', 2)
print('float precision:', pd.options.display.precision)

Max Num of Cols: None
float precision: 2


# Attribute-style access (df2.x4) selects the column named 'x4'; inspect its type.
type(df2.x4)

pandas.core.series.Series


# Show the type of each layer that makes up a DataFrame: the frame itself,
# its columns, its index, and the arrays underneath them.
print('Decompising main Pandas Data Structure:')
print()
layers = [
    ('Data Structure:  ', (df2,)),
    ('Series: ', (df2['x2'], df2.x4)),
    ('Index: ', (df2.index,)),
    ('Underlying NumPy array in Series:', (df2.x2.values,)),
    ('Underlying NumPy array in Index: ', (df2.index.values,)),
]
for label, objects in layers:
    print(label, *(type(obj) for obj in objects))

Decompising main Pandas Data Structure:

Data Structure:   <class 'pandas.core.frame.DataFrame'>
Series:  <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>
Index:  <class 'pandas.core.indexes.range.RangeIndex'>
Underlying NumPy array in Series: <class 'numpy.ndarray'>
Underlying NumPy array in Index:  <class 'numpy.ndarray'>


# shape is an attribute (no parentheses): a (rows, columns) tuple.
print(df2.shape)

(78, 7)


# size is the total number of elements, i.e. rows * columns.
df2.size

546


# ndim is the number of axes; a DataFrame has two (rows and columns).
df2.ndim

2


print(df2.head()) # with no argument, head() returns the first 5 rows

   idx  x3  x2  x1  y2  y1  x4
0    1   1   4   3   1   1   4
1    2   0   2   3   0   1   3
2    3   0   1   3   1   0   2
3    4   1   1   3   1   1   2
4    5   0   1   3   1   1   2


# Pass a row count to head() to choose how many leading rows are returned.
print(df2.head(10))

   idx  x3  x2  x1  y2  y1  x4
0    1   1   4   3   1   1   4
1    2   0   2   3   0   1   3
2    3   0   1   3   1   0   2
3    4   1   1   3   1   1   2
4    5   0   1   3   1   1   2
5    6   0   1   3   0   1   2
6    7   1   2   2   0   1   2
7    8   0   1   3   0   0   2
8    9   0   2   1   0   0   1
9   10   1   2   3   1   1   2


# With no argument, tail() returns the last 5 rows.
# Refer to df2 by name rather than `df`, which is only a leftover loop
# variable and happens to point at df2.
print(df2.tail())

    idx  x3  x2  x1  y2  y1  x4
73   74   0   2   1   0   0   1
74   75   0   1   3   0   1   2
75   76   0   4   3   1   0   2
76   77   0   1   2   0   0   1
77   78   1   3   1   1   1   2


# Last 10 rows of df2 (named explicitly instead of the leftover loop variable `df`).
df2.tail(10)


# Compute the summary statistics once and reuse them for both prints;
# describe() itself returns a DataFrame.
summary = df2.describe()
print(summary)
print()
print(type(summary))

         idx     x3     x2     x1     y2     y1     x4
count  78.00  78.00  78.00  78.00  78.00  78.00  78.00
mean   39.50   0.10   1.81   2.14   0.37   0.54   1.81
std    22.66   0.31   0.97   0.75   0.49   0.50   0.79
min     1.00   0.00   1.00   1.00   0.00   0.00   1.00
25%    20.25   0.00   1.00   2.00   0.00   0.00   1.00
50%    39.50   0.00   1.00   2.00   0.00   1.00   2.00
75%    58.75   0.00   2.75   3.00   1.00   1.00   2.00
max    78.00   1.00   4.00   3.00   1.00   1.00   4.00

<class 'pandas.core.frame.DataFrame'>


# include='all' summarises every column, not only the numeric ones;
# statistics that do not apply to a column's type are reported as NaN.
df1.describe(include= 'all')


# Count how many rows take each distinct value of y2.
df2.y2.value_counts()

0    49
1    29
Name: y2, dtype: int64


# sort=False skips ordering the counts by frequency.
df1.Pos.value_counts(sort=False)

QB    2
LS    1
WR    1
FS    1
Name: Pos, dtype: int64


# value_counts() returns a Series whose index holds the distinct values.
pos_counts = df1.Pos.value_counts(sort=False)
print(type(pos_counts))
print(pos_counts.index.values)

<class 'pandas.core.series.Series'>
['QB' 'LS' 'WR' 'FS']


# info() prints the index range, per-column non-null counts and dtypes,
# and the frame's memory usage.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   idx     78 non-null     int64
 1   x3      78 non-null     int64
 2   x2      78 non-null     int64
 3   x1      78 non-null     int64
 4   y2      78 non-null     int64
 5   y1      78 non-null     int64
 6   x4      78 non-null     int64
dtypes: int64(7)
memory usage: 4.4 KB


# dtypes is an attribute (no parentheses): one dtype per column.
df1.dtypes

Name      object
Weight     int64
No         int64
Pos       object
dtype: object


# Show the frame with its current labels, then rebuild it from `data`
# with explicit letter labels for the rows.
print(df1)
print()
row_labels = list('abcde')
df1 = pd.DataFrame(data, index=row_labels)
print(df1)

       Name  Weight  No Pos
0      Wood     255  49  LS
1   Winston     231   2  QB
2  Williams     195  43  FS
3     White     185  14  WR
4  Lorenzen     285  13  QB

       Name  Weight  No Pos
a      Wood     255  49  LS
b   Winston     231   2  QB
c  Williams     195  43  FS
d     White     185  14  WR
e  Lorenzen     285  13  QB


# reset_index() returns a new DataFrame; the result is discarded here,
# so the print below shows df1 unchanged.
df1.reset_index() # this won't make any change in df1
print(df1)

       Name  Weight  No Pos
a      Wood     255  49  LS
b   Winston     231   2  QB
c  Williams     195  43  FS
d     White     185  14  WR
e  Lorenzen     285  13  QB


# inplace=True modifies df1 itself; drop=True discards the old labels
# instead of keeping them as a new column.
df1.reset_index(drop = True,inplace=True)
print(df1)

       Name  Weight  No Pos
0      Wood     255  49  LS
1   Winston     231   2  QB
2  Williams     195  43  FS
3     White     185  14  WR
4  Lorenzen     285  13  QB


# Work on an independent deep copy so changes to df3 never affect df1.
df3 = df1.copy(deep=True)
# df1's labels were reset to 0..4 in place above, but the label-based drops
# that follow use 'a'..'e'. Restore the letter labels on the copy so those
# calls do not raise KeyError when the file is run top to bottom.
df3.index = ['a', 'b', 'c', 'd', 'e']
df3


# drop() returns a new DataFrame by default; df3 itself is only modified by
# the final call, which passes inplace=True.
# NOTE(review): the row drops below require df3 to carry the labels 'a'..'e'.

# Drop one row by label (axis=0 means rows).
df3.drop('c', axis=0)


# Drop several rows at once; rows are the default axis.
df3.drop(['a','d'])


df3.drop('Name',axis = 1) # axis = 1 is for columns


# axis='columns' is the named equivalent of axis=1.
df3.drop('Name',axis = 'columns')


# Drop several columns at once.
df3.drop(['Name','No'],axis = 1)


# inplace=True removes the column from df3 itself.
df3.drop('Name',axis = 'columns',inplace=True)
print(df3)

   Weight  No Pos
a     255  49  LS
b     231   2  QB
c     195  43  FS
d     185  14  WR
e     285  13  QB


# Column labels currently present on df3.
df3.columns

Index(['Weight', 'No', 'Pos'], dtype='object')

# Display df1 in full.
df1


# Column labels of df1, held in an Index object.
df1.columns

Index(['Name', 'Weight', 'No', 'Pos'], dtype='object')


# Map the abbreviated column names to descriptive ones. rename() returns a
# new frame, so the result is bound back to df3.
new_names = {'No': 'Number', 'Pos': 'Position'}
df3 = df3.rename(columns=new_names)
print(df3)

   Weight  Number Position
a     255      49       LS
b     231       2       QB
c     195      43       FS
d     185      14       WR
e     285      13       QB

# Display df3 before sorting.
df3


# Heaviest first; head(5) keeps at most the first five rows of the sorted result.
df3.sort_values(by='Weight', ascending=False).head(5)


# Sort by Position, breaking ties by Weight (ascending is the default).
df3.sort_values(by=['Position', 'Weight'])


# Same sort keys, both in descending order.
df3.sort_values(by=['Position', 'Weight'],ascending= False)

# Display df2 in full.
df2


# .loc selects by label: the row labelled 4, returned as a Series.
df2.loc[4]

idx    5
x3     0
x2     1
x1     3
y2     1
y1     1
x4     2
Name: 4, dtype: int64


# Label-based slice with a step: .loc slices include the end label, so this
# takes rows 2, 4, 6, 8 and 10, restricted to three named columns.
df2.loc[2:10:2,['idx', 'y2', 'x4']]


# One row label plus a list of column names gives a Series of those values.
df2.loc[4,['x4', 'y1']]

x4    2
y1    1
Name: 4, dtype: int64


# .iloc selects by integer position and excludes the slice end:
# row positions 0-1, column positions 0, 2 and 4.
df2.iloc[0:2,0:6:2]


# Row position 4, column positions 6 and 5, in that order.
df2.iloc[4, [6,5]]

x4    2
y1    1
Name: 4, dtype: int64


# Row position 4, column positions 3, 4 and 5 (the end position 6 is excluded).
df2.iloc[4, 3:6]

x1    3
y2    1
y1    1
Name: 4, dtype: int64


# Same column range with a step of 2: column positions 3 and 5 only.
df2.iloc[4, 3:6:2]

x1    3
y1    1
Name: 4, dtype: int64

df1


# Rows whose Name is exactly 'Williams'.
is_williams = df1.Name == 'Williams'
df1.loc[is_williams]


# Rows for the LS or QB positions that also weigh more than 215.
is_ls_or_qb = df1.Pos.isin(['LS', 'QB'])
is_heavy = df1.Weight > 215
df1.loc[is_ls_or_qb & is_heavy]


# The row labelled 0, returned as a Series indexed by column name.
df2.loc[0]

idx    1
x3     1
x2     4
x1     3
y2     1
y1     1
x4     4
Name: 0, dtype: int64


# x4 values for the rows where y1 equals 1 and x2 is at least 2.
selected_rows = (df2.y1 == 1) & (df2.x2 >= 2)
df2.loc[selected_rows, 'x4']

0     4
1     3
6     2
9     2
12    2
13    4
14    3
16    2
25    2
29    3
34    2
35    1
39    2
40    2
43    2
56    2
57    2
58    2
59    3
62    3
64    2
66    3
69    2
70    3
72    2
77    2
Name: x4, dtype: int64


# Load the 'water readings.xls' workbook, then plot its 'Gallons Used'
# column against 'Read Date'.
water_readings = pd.read_excel('water readings.xls')
water_readings.plot(x='Read Date', y='Gallons Used')

<matplotlib.axes._subplots.AxesSubplot at 0x7fed6c18fa10>

	Name	Weight	No	Pos
count	5	5.00	5.00	5
unique	5	NaN	NaN	4
top	Lorenzen	NaN	NaN	QB
freq	1	NaN	NaN	2
mean	NaN	230.20	24.20	NaN
std	NaN	41.54	20.56	NaN
min	NaN	185.00	2.00	NaN
25%	NaN	195.00	13.00	NaN
50%	NaN	231.00	14.00	NaN
75%	NaN	255.00	43.00	NaN
max	NaN	285.00	49.00	NaN

5. Data Structures for Analysis: Pandas

5.1 Introduction to Pandas

5.2 Reading Data into Pandas

5.2.1 Data Files

5.2.2 Dictionaries to Dataframe

5.2.3 Side Note on `GitHub Raw`

5.2.4 Read CSVs

5.2.5 Export to CSV

5.2.6 Pandas `.set_option()`

5.2.7 Understanding Pandas Data Structure

5.3 Basic info on Data

5.3.1 `.shape` et al

5.3.2 `.head()`

5.3.3 `.tail()`

5.3.4 `.describe()`

5.3.5 `.value_counts()`

5.3.6 `.info()`

5.3.7 `.dtypes`

5.4 Pandas DataFrame indexes/ columns

5.4.1 set/ reset `index`

5.4.2 `.drop`

5.4.3 `.rename()`

5.4.4 `.sort_values()`

5.5 Querying Pandas Objects

5.5.1 Subset using `.loc` and `.iloc`

5.5.2 Boolean Indexing

	idx	x3	x2	x1	y2	y1	x4
68	69	0	1	2	1	0	2
69	70	0	3	1	1	1	2
70	71	0	2	3	0	1	3
71	72	0	2	2	0	0	1
72	73	1	3	1	1	1	2
73	74	0	2	1	0	0	1
74	75	0	1	3	0	1	2
75	76	0	4	3	1	0	2
76	77	0	1	2	0	0	1
77	78	1	3	1	1	1	2

	Name	Weight	No	Pos
a	Wood	255	49	LS
b	Winston	231	2	QB
c	Williams	195	43	FS
d	White	185	14	WR
e	Lorenzen	285	13	QB

	idx	x3	x2	x1	y2	y1	x4
0	1	1	4	3	1	1	4
1	2	0	2	3	0	1	3
2	3	0	1	3	1	0	2
3	4	1	1	3	1	1	2
4	5	0	1	3	1	1	2
...	...	...	...	...	...	...	...
73	74	0	2	1	0	0	1
74	75	0	1	3	0	1	2
75	76	0	4	3	1	0	2
76	77	0	1	2	0	0	1
77	78	1	3	1	1	1	2

	idx	x3	x2	x1	y2	y1	x4
68	69	0	1	2	1	0	2
69	70	0	3	1	1	1	2
70	71	0	2	3	0	1	3
71	72	0	2	2	0	0	1
72	73	1	3	1	1	1	2
73	74	0	2	1	0	0	1
74	75	0	1	3	0	1	2
75	76	0	4	3	1	0	2
76	77	0	1	2	0	0	1
77	78	1	3	1	1	1	2

	idx	x3	x2	x1	y2	y1	x4
0	1	1	4	3	1	1	4
1	2	0	2	3	0	1	3
2	3	0	1	3	1	0	2
3	4	1	1	3	1	1	2
4	5	0	1	3	1	1	2
...	...	...	...	...	...	...	...
73	74	0	2	1	0	0	1
74	75	0	1	3	0	1	2
75	76	0	4	3	1	0	2
76	77	0	1	2	0	0	1
77	78	1	3	1	1	1	2

5. Data Structures for Analysis: Pandas

5.1 Introduction to Pandas

5.2 Reading Data into Pandas

5.2.1 Data Files

5.2.2 Dictionaries to Dataframe

5.2.3 Side Note on GitHub Raw

5.2.4 Read CSVs

5.2.5 Export to CSV

5.2.6 Pandas .set_option()

5.2.7 Understanding Pandas Data Structure

5.3 Basic info on Data

5.3.1 .shape et al

5.3.2 .head()

5.3.3 .tail()

5.3.4 .describe()

5.3.5 .value_counts()

5.3.6 .info()

5.3.7 .dtypes

5.4 Pandas DataFrame indexes/ columns

5.4.1 set/ reset index

5.4.2 .drop

5.4.3 .rename()

5.4.4 .sort_values()

5.5 Querying Pandas Objects

5.5.1 Subset using .loc and .iloc

5.5.2 Boolean Indexing

5.2.3 Side Note on `GitHub Raw`

5.2.6 Pandas `.set_option()`

5.3.1 `.shape` et al

5.3.2 `.head()`

5.3.3 `.tail()`

5.3.4 `.describe()`

5.3.5 `.value_counts()`

5.3.6 `.info()`

5.3.7 `.dtypes`

5.4.1 set/ reset `index`

5.4.2 `.drop`

5.4.3 `.rename()`

5.4.4 `.sort_values()`

5.5.1 Subset using `.loc` and `.iloc`

	idx	x3	x2	x1	y2	y1	x4
68	69	0	1	2	1	0	2
69	70	0	3	1	1	1	2
70	71	0	2	3	0	1	3
71	72	0	2	2	0	0	1
72	73	1	3	1	1	1	2
73	74	0	2	1	0	0	1
74	75	0	1	3	0	1	2
75	76	0	4	3	1	0	2
76	77	0	1	2	0	0	1
77	78	1	3	1	1	1	2

	idx	x3	x2	x1	y2	y1	x4
0	1	1	4	3	1	1	4
1	2	0	2	3	0	1	3
2	3	0	1	3	1	0	2
3	4	1	1	3	1	1	2
4	5	0	1	3	1	1	2
...	...	...	...	...	...	...	...
73	74	0	2	1	0	0	1
74	75	0	1	3	0	1	2
75	76	0	4	3	1	0	2
76	77	0	1	2	0	0	1
77	78	1	3	1	1	1	2